package cn.itmtr.document.download.common.util;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.digest.MD5;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpStatus;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSON;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

import static cn.itmtr.document.download.common.util.UrlPropertyTypeEnum.*;

/**
 * 网页下载工具类
 *
 * @author mtr
 * @since 2021-10-04
 */
@Slf4j
public class HtmlDownloadUtil {

    public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36";

    public static final String URL_REGEX = "(https://|http://|//|/)([\\w-]+\\.)+[\\w-]+(:\\d+)*(/[\\w- ./?%&=]*)?";

    /**
     * 获取网页html内容
     *
     * @param url 网页地址
     * @return java.lang.String 返回网页内容
     * @author mtr
     * @date 2021/10/4
     */
    public static String html(String url) {
        HttpResponse response = HttpUtil.createGet(url, true).header("User-Agent", USER_AGENT).execute();
        if (HttpStatus.HTTP_OK == response.getStatus()) {
            return response.body();
        }
        log.error("获取[{}]网页内容失败: {}", url, response.body());
        return "";
    }

    /**
     * 下载资源致指定路径
     *
     * @param url  链接地址
     * @param dest 保存路径
     * @author mtr
     * @date 2021/10/4
     */
    public static void downloadSource(String url, String dest, String referer) {
        HttpResponse response = HttpUtil.createGet(url, true)
                .header("User-Agent", USER_AGENT)
                .header("Referer", referer)
                .execute();
        response.writeBody(dest);
    }

    /**
     * 获取网页中所有链接
     *
     * @param htmlContent 网页内容
     * @return java.util.List<java.lang.String>
     * @author mtr
     * @date 2021/10/4
     */
    public static List<String> getAllUrl(String htmlContent) {
        return ReUtil.findAllGroup0(URL_REGEX, htmlContent);
    }

    /**
     * 获取访问链接
     *
     * @param url        原链接
     * @param protocol   协议 http 或 https
     * @param domain     域名
     * @param currentUri 当前路径
     * @return java.lang.String
     * @author mtr
     * @date 2021/10/4
     */
    public static String getVisitUrl(String url, String protocol, String domain, String currentUri) {
        if (StrUtil.isBlank(currentUri)) {
            currentUri = "/";
        } else {
            int lastIndex = currentUri.lastIndexOf("/");
            if (lastIndex > 0) {
                if (currentUri.substring(lastIndex).contains(".")) {
                    currentUri = currentUri.substring(0, lastIndex + 1);
                } else if (!currentUri.endsWith("/")) {
                    currentUri += "/";
                }
            }
        }
        if (HttpUtil.isHttp(url) || HttpUtil.isHttps(url)) {
            return url;
        } else if (url.startsWith("//")) {
            return protocol + ":" + url;
        } else if (url.startsWith("/")) {
            return protocol + "://" + domain + url;
        } else if (url.startsWith("../")) {
            while (url.startsWith("../")) {
                int towIndex = StrUtil.ordinalIndexOf(StrUtil.reverse(currentUri), "/", 2);
                if (towIndex > 0) {
                    towIndex = currentUri.length() - towIndex;
                    if (towIndex > 0) {
                        currentUri = currentUri.substring(0, towIndex);
                    }
                }
                url = url.length() > 3 ? url.substring(3) : "";
            }
            return protocol + "://" + domain + currentUri + url;
        } else if (url.startsWith("./")) {
            int towIndex = currentUri.lastIndexOf("/");
            if (towIndex > 0) {
                currentUri = currentUri.substring(0, towIndex + 1);
            }
            url = url.length() > 2 ? url.substring(2) : "";
            while (url.startsWith("../")) {
                towIndex = StrUtil.ordinalIndexOf(StrUtil.reverse(currentUri), "/", 2);
                if (towIndex > 0) {
                    towIndex = currentUri.length() - towIndex;
                    if (towIndex > 0) {
                        currentUri = currentUri.substring(0, towIndex);
                    }
                }
                url = url.length() > 3 ? url.substring(3) : "";
            }
            return protocol + "://" + domain + currentUri + url;
        }
        return protocol + "://" + domain + currentUri + url;
    }

    /**
     * 解析Url链接
     *
     * @param url 链接地址
     * @return cn.itmtr.document.download.common.util.UrlProperty
     * @author mtr
     * @date 2021/10/4
     */
    public static UrlProperty resolveUrl(String url) {
        UrlProperty urlProperty = new UrlProperty();
        urlProperty.setTargetUrl(url);
        urlProperty.setProtocol(HttpUtil.isHttps(url) ? "https" : "http");

        url = url.substring(url.indexOf("://") + 3);
        int slashIndex = url.indexOf("/");
        int queryIndex = url.indexOf("?");
        if (queryIndex == -1) {
            queryIndex = url.length();
            urlProperty.setQuery("");
        } else {
            urlProperty.setQuery(url.substring(queryIndex));
        }
        if (slashIndex == -1) {
            slashIndex = queryIndex;
            urlProperty.setUri("");
        } else {
            urlProperty.setUri(url.substring(slashIndex, queryIndex));
        }
        String uri = urlProperty.getUri();
        if (uri.contains("#")) {
            urlProperty.setUri(uri.substring(0, uri.indexOf("#")));
        }
        urlProperty.setDomain(url.substring(0, slashIndex));
        return urlProperty;
    }

    /**
     * 解析链接
     *
     * @param url        链接
     * @param protocol   协议 http 或 https
     * @param domain     域名
     * @param currentUri 当前路径
     * @param typeEnum   类型
     * @return cn.itmtr.document.download.common.util.UrlProperty
     * @author mtr
     * @date 2021/10/4
     */
    public static UrlProperty resolveUrl(String url, String protocol, String domain, String currentUri, UrlPropertyTypeEnum typeEnum) {
        String visitUrl = getVisitUrl(url, protocol, domain, currentUri);
        UrlProperty urlProperty = resolveUrl(visitUrl);
        urlProperty.setSourceUrl(url);
        urlProperty.setSourceUrlKey(MD5.create().digestHex(url));
        urlProperty.setType(typeEnum.getType());
        String uri = urlProperty.getUri();
        if (HTML.equals(typeEnum)) {
            if (uri.endsWith("/")) {
                uri += "index.html";
            } else {
                String extName = FileUtil.extName(uri);
                if (StrUtil.isNotBlank(extName)) {
                    if ("htm".equals(extName)) {
                        extName = HTML.getType();
                    }
                    urlProperty.setType(extName);
                } else {
                    uri += "/index.html";
                }
            }
        }
        urlProperty.setPagePath(uri);
        return urlProperty;
    }

    /**
     * 替换html中的链接为FreeMark变量
     *
     * @param html       html内容
     * @param protocol   协议 http 或 https
     * @param domain     域名
     * @param currentUri 当前路径
     * @param list       存放找到的链接
     * @return java.lang.String
     * @author mtr
     * @date 2021/10/4
     */
    public static String filterUrl(String html, String protocol, String domain, String currentUri, List<UrlProperty> list) {
        Document document = Jsoup.parse(html);
        Elements aList = document.select("a");
        replaceElem(protocol, domain, currentUri, list, aList, "href", HTML);

        Elements linkList = document.select("link[rel='stylesheet']");
        replaceElem(protocol, domain, currentUri, list, linkList, "href", CSS);

        Elements iconList = document.select("link[rel='icon']");
        replaceElem(protocol, domain, currentUri, list, iconList, "href", IMAGE);

        Elements scriptList = document.select("script");
        replaceElem(protocol, domain, currentUri, list, scriptList, "src", JS);

        Elements imgList = document.select("img");
        replaceElem(protocol, domain, currentUri, list, imgList, "src", IMAGE);

        Elements audioList = document.select("audio");
        replaceElem(protocol, domain, currentUri, list, audioList, "src", AUDIO);

        Elements videoList = document.select("video");
        replaceElem(protocol, domain, currentUri, list, videoList, "src", VIDEO);

        return document.toString();
    }

    /**
     * 替换元素数据
     *
     * @param protocol   协议 http 或 https
     * @param domain     域名
     * @param currentUri 当前路径
     * @param list       存放找到的链接
     * @param elements   元素内容
     * @param attrKey    要替换的属性key
     * @param typeEnum   类型
     * @author mtr
     * @date 2021/10/4
     */
    private static void replaceElem(String protocol, String domain, String currentUri, List<UrlProperty> list, Elements elements, String attrKey, UrlPropertyTypeEnum typeEnum) {
        elements.forEach(elem -> {
            String url = elem.attr(attrKey);
            if (StrUtil.isNotBlank(url) && !url.startsWith("#") && !url.startsWith("javascript")) {
                UrlProperty urlProperty = resolveUrl(url, protocol, domain, currentUri, typeEnum);
                list.add(urlProperty);
                elem.attr(attrKey, "${resourcesMap['" + urlProperty.getSourceUrlKey() + "']!'" + url + "'}");
            }
        });
    }

    public static void main(String[] args) {
        String url = "https://www.layui.com/doc/index.html";
        String html = html(url);
        log.info("获取到html：{}", html);
        List<UrlProperty> allUrlProperty = new ArrayList<>(100);
        UrlProperty urlProperty = resolveUrl(url);
        log.info("获取到父级内容为：{}", JSON.toJSONString(urlProperty, true));
        String filterHtml = filterUrl(html, urlProperty.getProtocol(), urlProperty.getDomain(), urlProperty.getUri(), allUrlProperty);
        log.info("获取到html替换后的内容：{}", filterHtml);
        log.info("获取到所有链接为：{}", JSON.toJSONString(allUrlProperty, true));
    }

}
